## Demand estimation ##
## Author: Jeronimo Callejas ##
## First Version: March 10th 2021 ##
## Last Version: March 10th 2021##
#%% Loading packages
"""
Loading packages 

"""
#import pyblp
import numpy as np
import pandas as pd
import os
import json
import difflib
import matplotlib as plt
from scipy.spatial import distance
import seaborn as sns

#%% Functions

def fillnan (ds1,ds2,char):
    """
    This function takes one dataset (ds1) and fild nas of each column in char 
    using data from a sectond dataset (ds2).
    
    """
    tar={}
    ds1.dropna(subset = ['Sub Model'], inplace=True)
    char2=["dist","Engine (ccm)","Generation Year"]
    temp1=ds1
    v_u=temp1.VerYear.unique()
        
    for obs in v_u:
        tar[obs]={}
        mdl=temp1[temp1.VerYear==obs]['Sub Model'].unique()[0]
        mdl2=temp1[temp1.VerYear==obs]['Version'].unique()[0]
        variables=temp1[temp1.VerYear==obs].mode()
        lst_1=variables.iloc[0][char2].fillna(1)
        pm=list(set(difflib.get_close_matches(mdl, ds2.model_1,n=15,cutoff=0.85)))
            
        temp2=ds2[ds2['model_1'].isin(pm)]
            
        if temp2.shape[0]<1:
            for cc in char:
                tar[obs][cc]=np.nan
                print(mdl)
        else: 
            #temp2['dist']=temp2.apply(lambda row: vect.fit_transform([mdl2, row['Version']]).toarray(), axis=1 )
            temp2['dist']=  temp2.apply(lambda row:difflib.SequenceMatcher(None, mdl2, row['Version']).ratio() , axis=1 )
            temp2['c_dist']=temp2.apply(lambda row:distance.cosine(row[char2].tolist(),lst_1) , axis=1 )
            #ff=min(len(temp2),2)
            for cc in char:
                target=temp2.sort_values(by=['c_dist'])[:1][cc].iloc[0]
                #target=temp2.sort_values(by=['c_dist'])[:ff][cc].median()
                tar[obs][cc]=target
    return(tar)
    
def fillnan_p (ds1,ds2,char):
    """
    This function takes one dataset (ds1) and fild nas in the price column 
    using data from a sectond dataset (ds2).
    
    """
    tar={}
    ds1.dropna(subset = ['Q'], inplace=True) 
    char2=["dist","Year","Engine (ccm)"]
    temp1=ds1
    #v_u=temp1.VerYear.unique()
    v_u=temp1.Match.unique()
        
    for obs in v_u:
        #mdl=temp1[temp1.VerYear==obs]['Match'].unique()[0]
        mdl=obs
        tar[obs]={}
        #mdl2=temp1[temp1.VerYear==obs]['Version'].unique()[0]
        mdl2=temp1[temp1.Match==obs]['Version'].unique()[0]
        #yr=temp1[temp1.VerYear==obs]['Year'].unique()[0]
        yr=temp1[temp1.Match==obs]['Year'].unique()[0]
        #variables=temp1[temp1.VerYear==obs]
        variables=temp1[temp1.Match==obs]
        variables=variables.drop(variables[variables.Year!=yr].index)
        lst_1=variables.iloc[0][char2].fillna(1)
        pm=list(set(difflib.get_close_matches(mdl, ds2.Match,n=15,cutoff=0.90)))
            
        temp2=ds2[ds2['Match'].isin(pm)]
        temp2=temp2.drop(temp2[temp2.Year!=yr].index)
        
        if temp2[char2[-1]].isnull().values.any():
            char2=char2[:-1]
            lst_1=lst_1[:-1]
            
        if temp2.shape[0]<1:
            for cc in char:
                tar[obs][cc]=np.nan
                print(mdl)
        else: 
            #temp2['dist']=temp2.apply(lambda row: vect.fit_transform([mdl2, row['Version']]).toarray(), axis=1 )
            temp2['dist']=  temp2.apply(lambda row:difflib.SequenceMatcher(None, mdl2, row['Version']).ratio() , axis=1 )
            temp2['c_dist']=temp2.apply(lambda row:distance.cosine(row[char2].tolist(),lst_1) , axis=1 )
            #ff=min(len(temp2),2)
            for cc in char:
                target=temp2.sort_values(by=['c_dist'])[:1][cc].iloc[0]
                #target=temp2.sort_values(by=['c_dist'])[:ff][cc].median()
                tar[obs][cc]=target
    price_df=pd.DataFrame.from_dict(tar,orient='index')
    return(price_df)                  
def fillnan_p2 (ds1,ds2,char):
    """
    This function takes one dataset (ds1) and fild nas of each column in char 
    using data from a sectond dataset (ds2).
    
    """
    tar={}
    temp1=ds1
    v_u=temp1.Match2.unique()       
    for obs in v_u:
        tar[obs]={}
        yr=temp1[temp1.Match2==obs]['Year'].unique()[0]
        pm=list(set(difflib.get_close_matches(obs, ds2.Match2,n=15,cutoff=0.97)))
            
        temp2=ds2[ds2['Match2'].isin(pm)]
        temp2=temp2.drop(temp2[temp2.Year!=yr].index)
                    
        if temp2.shape[0]<1:
            for cc in char:
                tar[obs][cc]=np.nan
                print(mdl)
        else: 
            for cc in char:
                target=temp2[cc].mean()
                #target=temp2.sort_values(by=['c_dist'])[:ff][cc].median()
                tar[obs][cc]=target
                
    price_df=pd.DataFrame.from_dict(tar,orient='index')
    return(price_df)
#%% Loading Data and preparing data
"""
Data preparation

In this section, data is loaded. Also, some variables are changed from
categorical to binary.

"""

car_data=pd.read_csv('data_col_ini.csv',encoding="ISO-8859-1") # Initial data from IHS (has no prices)
car_data['Version']=car_data['Version'].str.upper() # Change version names to upper cases
car_data['VerYear']= [i + j for i, j in zip(car_data['Version'], car_data['Generation Year'].apply(str))]
car_data['Model']=car_data['Model'].str.upper() # Change model names to upper cases
car_data['Make']=car_data['Make'].str.upper() # Change make names to upper cases
car_data['Sub Model']=car_data['Sub Model'].str.upper() # Change make names to upper cases
car_data=car_data.replace('Unspecified',np.nan) # Replace unspecified by nan
car_data=car_data.replace(-1,np.nan)# Replace -1 by nan
car_data['Axle Configuration']=car_data['Axle Configuration'].replace('2 axles',' 4 X 2') # Consolidate 4x2 characteristic
car_data['Sub Model']=car_data['Sub Model'].replace('HYUNDAI GRAND I10','HYUNDAI I10') # Consolidate 4x2 characteristic
car_data['Version']=car_data['Version'].replace('HYUNDAI GRAND I10 1.0','HYUNDAI I10 1.0') # Consolidate 4x2 characteristic
car_data['No. of Seats']=car_data['No. of Seats'].str.rstrip('_Seats') #remove word Seats from data
car_data['Model']=car_data['Model'].str.rstrip('-CLASS')
car_data['Version']=car_data['Version'].str.rstrip(' (MIDSIZED)')
car_data['Sub Model']=car_data['Sub Model'].str.rstrip(' (MIDSIZED)')
car_data['Turbo']=car_data['Turbo'].agg(lambda x: 1 if x=='Turbo' else (0 if x=='Non turbo' else np.nan)) # Turbo==1
car_data['Axle Configuration']=car_data['Axle Configuration'].agg(lambda x: 1 if x==' 4 X 2' else (0 if x==' 4 X 4' else np.nan)) # 4x2==1
car_data['Transmission']=car_data['Transmission'].agg(lambda x: 1 if x=='Automatic' else (0 if x=='Manual' else np.nan)) # Automatic==1
car_data['Driven Wheels']=car_data['Driven Wheels'].astype('category')
car_data['Emissions']=np.nan



#%% Sample data
sample_data=pd.read_csv('sample.csv',encoding="ISO-8859-1") # Sample data consisting on subset of months. Inlcudes prices
sample_data['Version']=sample_data['Version'].str.upper() # Change version names to upper cases
sample_data['VerYear']= [i + j for i, j in zip(sample_data['Version'], sample_data['Generation Year'].apply(str))]
sample_data['Model']=sample_data['Model'].str.upper() # Change model names to upper cases
sample_data['Make']=sample_data['Make'].str.upper() # Change make names to upper cases
sample_data=sample_data.replace('Unspecified',np.nan) # Replace unspecified by nan
sample_data=sample_data.replace(-1,np.nan)# Replace -1 by nan
sample_data['Axle Configuration']=sample_data['Axle Configuration'].replace('2 axles',' 4 X 2') # Consolidate 4x2 characteristic
sample_data['Sub Model']=sample_data['Sub Model'].replace('HYUNDAI GRAND I10','HYUNDAI I10') # Consolidate 4x2 characteristic
sample_data['Version']=sample_data['Version'].replace('HYUNDAI GRAND I10 1.0','HYUNDAI I10 1.0') # Consolidate 4x2 characteristic
sample_data['No. of Seats']=sample_data['No. of Seats'].str.rstrip('_Seats') #remove word Seats from data
sample_data['Model']=sample_data['Model'].str.rstrip('-CLASS')
sample_data['Version']=sample_data['Version'].str.rstrip(' (MIDSIZED)')
sample_data['Sub Model']=sample_data['Sub Model'].str.rstrip(' (MIDSIZED)')
sample_data['Turbo']=sample_data['Turbo'].agg(lambda x: 1 if x=='Turbo' else (0 if x=='Non turbo' else np.nan)) # Turbo==1
sample_data['Axle Configuration']=sample_data['Axle Configuration'].agg(lambda x: 1 if x==' 4 X 2' else (0 if x==' 4 X 4' else np.nan)) # 4x2==1
sample_data['Transmission']=sample_data['Transmission'].agg(lambda x: 1 if x=='Automatic' else (0 if x=='Manual' else np.nan)) # Automatic==1
sample_data['Driven Wheels']=sample_data['Driven Wheels'].astype('category')


#%% load datra from scrapy

#with open("carspecs.json", "r") as f:
#    for i, line in enumerate(f):
#        try:
#            pass
#        except:
#            print('Error on line', i + 1, ':\n')
f = open("carspecs.json",)           
tmp=json.load(f)
f.close

scrapy={}

for dd in tmp:
    scrapy.update(dd)

specs=pd.DataFrame.from_dict(scrapy, orient='index')
specs.reset_index(inplace=True)

em=open('emissions.json',)
tmp=json.load(em)
em.close


emissions={}
for dd in tmp:
    emissions.update(dd)

emissions=pd.DataFrame.from_dict(emissions, orient='index')
emissions.reset_index(inplace=True)
emissions['index']=emissions['index'].str.replace('id','car-')

specs=pd.merge(specs,emissions,on='index')


specs["Axle Configuration"]=specs.layout.str.strip().str[-1]
specs['Driven Wheels']=specs['Axle Configuration'].agg(lambda x: "Front" if x=='F' else ("Rear" if x=='R' else ("AWD" if x== 'A' or '4' else np.nan))) 
specs['transmition']=specs.gear_box.str.extract('([a-zA-Z ]+)')
specs['gear_box']=specs.gear_box.str.extract('(\d+)')

specs.doors=specs.doors.str.rstrip(' door')
specs.emissions=specs.emissions.str.rstrip(' ')
specs.power=specs.power.str.rstrip(' ')
specs.weight=specs.weight.str.rstrip(' ')
specs.volume=specs.volume.str.rstrip(' ')

specs=specs.fillna(-1)

specs.year=specs.year.astype(int)
specs.doors=specs.doors.astype(int)
specs.emissions=specs.emissions.astype(float)
specs.power=specs.power.astype(float)
specs.weight=specs.weight.astype(int)
specs.volume=specs.volume.astype(int)
specs.gear_box=specs.gear_box.astype(int)

specs.manufacturer= specs.manufacturer.str.upper()
specs.model= specs.model.str.upper()


specs= specs.query("year > 1964 or year < 0" )

specs=specs.rename(columns={"fuel": "Fuel Type", "doors": "No. of Doors",
                            "transmition":"Transmission", "turbo":"Turbo",
                            "volume":"Engine (ccm)","year":"Generation Year",
                            "power":"Engine (HP)","emi":"Emissions",
                            "gear_box":"No. of Gears", "weight":"Gross Vehicle Weight" })

specs['model_1']= specs['model'].astype(str).str.split().str[0]
specs['model_1']= [i + j for i, j in zip(specs['manufacturer']+' ',specs['model_1'])]
specs['Version']= [i + j for i, j in zip(specs['manufacturer']+' ',specs['model'].astype(str))]
specs['No. of Cylinders']=specs.others.str[0]
specs['Turbo']=specs['Turbo'].agg(lambda x: 1 if x=='turbocharged' else (1 if x=='turbocharged and supercharged' else 0)) # Turbo==1
specs['Driven Wheels']=specs['Axle Configuration'].agg(lambda x: "Front" if x=='F' else ("Rear" if x=='R' else "AWD")) 
specs['Axle Configuration']=specs['Axle Configuration'].agg(lambda x: 1 if x=='F' else (1 if x=='R' else 0)) # Turbo==1
specs['Transmission']=specs['Transmission'].replace('AM',np.nan)
specs['Transmission']=specs['Transmission'].replace({'A': 1,'SA':1,'AOD':1,
     'WP':1,'X':1,'AT':1,'a':1,'M':0,'OD':0, 'CVT':0,'SM':0})



specs=specs.replace(-1,np.nan)# Replace -1 by nan

#%% Load price data
prices_IHS=pd.read_csv('Colombi_p.csv',encoding="ISO-8859-1")
prices_IHS['Version']=prices_IHS['Version'].str.upper() # Change version names to upper cases
prices_IHS['Model']=prices_IHS['Model'].str.upper() # Change model names to upper cases
prices_IHS['Make']=prices_IHS['Make'].str.upper() # Change make names to upper cases
prices_IHS=prices_IHS.replace('Unspecified',np.nan) # Replace unspecified by nan
prices_IHS=prices_IHS.replace(-1,np.nan)# Replace -1 by nan
prices_IHS['Sub Model']=prices_IHS['Sub Model'].replace('HYUNDAI GRAND I10','HYUNDAI I10') # Consolidate 4x2 characteristic
prices_IHS['Version']=prices_IHS['Version'].replace('HYUNDAI GRAND I10 1.0','HYUNDAI I10 1.0') # Consolidate 4x2 characteristic
prices_IHS['Model']=prices_IHS['Model'].str.rstrip('-CLASS')
prices_IHS['Version']=prices_IHS['Version'].str.rstrip(' (MIDSIZED)')
prices_IHS['Sub Model']=prices_IHS['Sub Model'].str.rstrip(' (MIDSIZED)')
prices_IHS['Match2']= [i + j for i, j in zip(prices_IHS['Version'], ' '+ prices_IHS['Year'].apply(str))]

#%% Analysis of how many enrtries have certain characteristic

headers= list(car_data.columns)
#car_data['country_num']=car_data.groupby('Country/Territory-Name')["id"].transform("count")
car_data=car_data.replace(-1,np.nan)
car_data=car_data.replace('Unspecified',np.nan)

headers.remove('Country/Territory-Name')
char_data=pd.DataFrame()
char_data=car_data.groupby('Country/Territory-Name')[headers].agg(lambda x: x.count()/x.size)

char_data.to_excel("char.xlsx")  
#%% Filling Nans
# Check variation of characteristics inside a version
"""
First: Check same version + year in Dataset
Second: Check same version + year in Sample
Third: Check same Model in EV dataset
Forth: Check same Model in data from website

"""

dt={}
chars=['No. of Gears','Gross Vehicle Weight','Engine (ccm)','Engine (HP)',\
       'Turbo','Axle Configuration','No. of Cylinders','Transmission',\
       'Fuel Type','Body Type','Body Group','Driven Wheels']
chars2=['Axle Configuration','Body Type','Body Group','Driven Wheels']

## Get some statistics from charateristics disperssion

unique_ver=car_data.VerYear.unique()
    
for ver in unique_ver:
    dt[ver]= car_data[car_data.VerYear==ver][chars]\
        .agg('mode')
    #.agg([lambda x: x.std()/x.mean() if (x.mean()>0) else 0,'median','size'])


## Create a dataset with median value of characteristics for each version
median_char=pd.DataFrame()
for mdl in dt:
   median_char=median_char.append(pd.DataFrame(dt[mdl].iloc[0]).transpose(),ignore_index=True)
    #ls.extend(dt[mdl].iloc[0].tolist())

median_char.index=list(dt.keys()) 

for ch in median_char:
    car_data[ch] = car_data[ch].fillna(car_data['VerYear'].map(median_char[ch].to_dict() ))




# fill nans with sample data
dt2={} 

unique_ver_2=sample_data.VerYear.unique()

for ver2 in unique_ver_2:
    dt2[ver2]= sample_data[sample_data.VerYear==ver2][chars]\
        .agg('mode')

median_char_2=pd.DataFrame()
for mdl2 in dt2:
   median_char_2=median_char_2.append(pd.DataFrame(dt2[mdl2].iloc[0]).transpose(),ignore_index=True)
    #ls.extend(dt[mdl].iloc[0].tolist())

median_char_2.index=list(dt2.keys()) 

for ch in median_char_2:
    car_data[ch] = car_data[ch].fillna(car_data['VerYear'].map(median_char_2[ch].to_dict() ))
    
#%% Filling nans with data from internet

chars3=['No. of Gears','Gross Vehicle Weight','Engine (HP)','Emissions',
       'Turbo','Axle Configuration','No. of Cylinders','Transmission',
       'Driven Wheels','No. of Doors','Generation Year']

    
tar2=fillnan (car_data,specs,chars3) 

median_char_3=pd.DataFrame.from_dict(tar2,orient='index')
    

for ch3 in chars3:
    car_data[ch3] = car_data[ch3].fillna(car_data['VerYear'].map(median_char_3[ch3].to_dict() ))

# filling nan with car submodel
 
#dt3={} 
#
#unique_ver_3=car_data['Sub Model'].unique()
#
#for ver3 in unique_ver_3:
#    dt3[ver3]= car_data[car_data['Sub Model']==ver3][chars2]\
#        .agg('mode')
#
#median_char_3=pd.DataFrame()
#for mdl3 in dt3:
#   median_char_3=median_char_3.append(pd.DataFrame(dt3[mdl3].iloc[0]).transpose(),ignore_index=True)
#    #ls.extend(dt[mdl].iloc[0].tolist())
#
#median_char_3.index=list(dt3.keys()) 
#
#for ch in median_char_2:
#    car_data[ch] = car_data[ch].fillna(car_data['VerYear'].map(median_char_2[ch].to_dict() ))
#
#    

#car_data.to_excel("data2.xlsx")  
    
#%% Transform data from wide format to long format
years=['Q2015','Q2016','Q2017','Q2018','Q2019','Q2020','Q2021'];
car_data["id"] = car_data.index

# Transform data from wide to long (in two steps)
car_data_w=pd.wide_to_long(car_data, years, i="id", j="Month",sep='/')
car_data_w.reset_index(inplace=True) 
car_data_w["id"] = car_data_w.index
cars=pd.wide_to_long(car_data_w, 'Q', i="id", j="Year")
cars.reset_index(inplace=True)
cars['Total Sales']=cars.groupby(['Country/Territory-Name','Year','Month'])['Q'].transform('sum')
cars['MS']=cars['Q']/cars['Total Sales']
cars=cars.drop(cars[cars.Q==0].index)
cars.dropna(subset = ['Q'], inplace=True)


#%% Colombian prices

l_col=cars[cars['Country/Territory-Name']=='Colombia'] 
l_col['Match']= [i + j for i, j in zip(l_col['Sub Model'], ' '+ l_col['Year'].apply(str))]
l_col['Match2']= [i + j for i, j in zip(l_col['Version'], ' '+ l_col['Year'].apply(str))]
#l_col['Match']= [i + j for i, j in zip(l_col['Version'], ' '+ l_col['Year'].apply(str))]
l_col=l_col.sort_values(by=['Year','Month','MS'],ascending=False)

col_prices=pd.read_csv('COL_P.csv',encoding="ISO-8859-1")
col_prices['Match']= [i + j + k for i, j, k in zip(col_prices['Make'], ' '+col_prices['Submodel'], ' '+ col_prices['Year'].apply(str))]
col_prices['Version']= [i + j for i, j in zip(col_prices['Make'], ' '+col_prices['Model 2'].apply(str))]

#l_col['MSRPDA']=np.nan
#l_col['MSRPD']=np.nan

prices_D=fillnan_p2 (l_col,prices_IHS,['MSRPD'])
l_col['MSRPD'] = l_col['MSRPD'].fillna(l_col['Match2'].map(prices_D['MSRPD'].to_dict() ))

prices_A=fillnan_p (l_col,col_prices,['MSRPDA'])
prices_D=fillnan_p2 (l_col,prices_IHS,['MSRPD'])


l_col['MSRPD'] = l_col['MSRPD'].fillna(l_col['Match2'].map(prices_D['MSRPD'].to_dict() ))
l_col['MSRPDA'] = l_col['MSRPDA'].fillna(l_col['Match'].map(prices_A['MSRPDA'].to_dict() ))
l_col['MSRPDA'] = l_col['MSRPDA'].fillna(l_col['Match2'].map(prices_D['MSRPD'].to_dict() ))

l_col.to_excel("Col_data.xlsx")  

nan_df=l_col[l_col['Engine (HP)'].isna()]

nan_df['Total MS']=nan_df.groupby(['Country/Territory-Name','Year','Month'])['MS'].transform('sum')
nan_df['sum_ms']=nan_df.groupby(['VerYear'])['MS'].transform('sum')

nan_df=nan_df.sort_values(by=('sum_ms'),ascending=False)

l_col.to_excel(("Col_data_F.xlsx")) 
